Back to content page

Learning objective

  • Understand the bias-variance tradeoff in model selection
  • Perform model selection using ridge, LASSO and elastic net regressions
  • Recognize pros and cons of various model selection approaches

1 Model selection

Principles

  • Goodness of fit
    • How similar the observed and predicted values are
    • Measured by R-squared, AIC, BIC, etc
  • Parsimony (simple instead of complicated model)
    • Occam’s razor: for models with similar goodness of fit, the simpler one is preferred
  • 2 Data sets used
    • Training set for developing the model
    • Testing set for examining the model in another data set

EX of overfitting

Training dataset => overfitting if an 8th-degree polynomial is fitted

Training dataset => also shows overfitting

Testing dataset => the 8th-degree polynomial fits poorly

Trade off between bias and variance

  • Bias - how far the predicted values deviate from the true values
  • Variance - how scattered the predicted values are
    • Ideally low bias and low variance
    • But in practice there is a tradeoff between bias and variance

Algebraic and graphical demonstration of trade-off between bias and variance

EX – prostate cancer

## Load the prostate cancer data set.
Prostate <- read.csv("Data/prostate.csv")

# Reproducible split: 20 randomly chosen rows form the testing set,
# the remaining rows form the training set.
set.seed(23456)
TestingIndex <- sample(seq_len(nrow(Prostate)), 20)
Training <- Prostate[-TestingIndex, ]
Testing <- Prostate[TestingIndex, ]

Automated selection procedure

  • Backward selection
    1. fit a regression lm(y~x)
      Alternative: lm(y~.) A period “.” means all other variables in the data set, a minus “–” excludes a variable (more convenient)
    2. backward selection step(function, direction="backward")
      Alternative: Add F-test or Chi-squared test in the output to see if there’s any inconsistency in model selection (like the last model in f test, p-val actually = 0.057445 >0.05)
  • Forward selection
  • Stepwise selection
## Backward selection: start from the full (most complex) model and drop terms.
Full <- lm(
  lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45,
  data = Training
)
Backward1 <- step(Full, direction = "backward")

# Alternative formula notation: "." means every other variable in the data
# set, and "-" excludes a variable (here age and svi are left out).
Alt <- lm(lpsa ~ . - age - svi, data = Training)
# Adding test = "F" / "Chisq" appends an F-test or chi-squared p-value
# column to each elimination step in the output.
Backward2 <- step(Full, direction = "backward", test = "F")
Backward3 <- step(Full, direction = "backward", test = "Chisq")

## Forward selection: start from the intercept-only model and add terms.
Null <- lm(lpsa ~ 1, data = Training)
Forward <- step(Null, scope = list(lower = Null, upper = Full),
                direction = "forward")

## Stepwise selection: terms may be added or dropped at every step.
Stepwise1 <- step(Full, direction = "both")
Stepwise2 <- step(Null, scope = list(lower = Null, upper = Full),
                  direction = "both")
## Start:  AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - gleason  1    0.1810 36.760 -40.933
## - lcp      1    0.2693 36.849 -40.748
## - pgg45    1    0.3331 36.912 -40.615
## <none>                 36.579 -39.313
## - age      1    1.1468 37.726 -38.936
## - lbph     1    2.3121 38.891 -36.594
## - svi      1    3.8819 40.461 -33.547
## - lweight  1    3.9289 40.508 -33.457
## - lcavol   1   20.3290 56.908  -7.282
## 
## Step:  AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - lcp      1    0.3002 37.060 -42.307
## <none>                 36.760 -40.933
## - age      1    1.1118 37.872 -40.638
## - pgg45    1    1.4277 38.188 -39.999
## - lbph     1    2.2925 39.053 -38.275
## - lweight  1    3.7861 40.546 -35.385
## - svi      1    3.8174 40.578 -35.325
## - lcavol   1   21.7089 58.469  -7.198
## 
## Step:  AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - age      1    0.9174 37.978 -42.424
## <none>                 37.060 -42.307
## - pgg45    1    1.1290 38.190 -41.996
## - lbph     1    2.2019 39.262 -39.863
## - svi      1    3.6525 40.713 -37.069
## - lweight  1    3.8516 40.912 -36.693
## - lcavol   1   25.1255 62.186  -4.453
## 
## Step:  AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - pgg45    1    0.8161 38.794 -42.787
## <none>                 37.978 -42.424
## - lbph     1    1.6290 39.607 -41.190
## - lweight  1    3.3245 41.302 -37.962
## - svi      1    3.9171 41.895 -36.865
## - lcavol   1   24.4609 62.439  -6.141
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 38.794 -42.787
## - lbph     1    2.0087 40.803 -40.900
## - lweight  1    2.9786 41.773 -39.091
## - svi      1    5.8688 44.663 -33.939
## - lcavol   1   27.6768 66.471  -3.322
## Start:  AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## - gleason  1    0.1810 36.760 -40.933  0.3366  0.563740    
## - lcp      1    0.2693 36.849 -40.748  0.5007  0.481611    
## - pgg45    1    0.3331 36.912 -40.615  0.6192  0.434086    
## <none>                 36.579 -39.313                      
## - age      1    1.1468 37.726 -38.936  2.1320  0.148860    
## - lbph     1    2.3121 38.891 -36.594  4.2982  0.041944 *  
## - svi      1    3.8819 40.461 -33.547  7.2163  0.009072 ** 
## - lweight  1    3.9289 40.508 -33.457  7.3038  0.008682 ** 
## - lcavol   1   20.3290 56.908  -7.282 37.7911 4.689e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## - lcp      1    0.3002 37.060 -42.307  0.5635  0.455425    
## <none>                 36.760 -40.933                      
## - age      1    1.1118 37.872 -40.638  2.0869  0.153094    
## - pgg45    1    1.4277 38.188 -39.999  2.6799  0.106175    
## - lbph     1    2.2925 39.053 -38.275  4.3031  0.041776 *  
## - lweight  1    3.7861 40.546 -35.385  7.1066  0.009558 ** 
## - svi      1    3.8174 40.578 -35.325  7.1653  0.009279 ** 
## - lcavol   1   21.7089 58.469  -7.198 40.7482 1.709e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC F value   Pr(>F)    
## - age      1    0.9174 37.978 -42.424  1.7327 0.192360    
## <none>                 37.060 -42.307                     
## - pgg45    1    1.1290 38.190 -41.996  2.1325 0.148675    
## - lbph     1    2.2019 39.262 -39.863  4.1589 0.045193 *  
## - svi      1    3.6525 40.713 -37.069  6.8990 0.010587 *  
## - lweight  1    3.8516 40.912 -36.693  7.2750 0.008754 ** 
## - lcavol   1   25.1255 62.186  -4.453 47.4572 1.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## - pgg45    1    0.8161 38.794 -42.787  1.5257  0.220827    
## <none>                 37.978 -42.424                      
## - lbph     1    1.6290 39.607 -41.190  3.0454  0.085294 .  
## - lweight  1    3.3245 41.302 -37.962  6.2153  0.014999 *  
## - svi      1    3.9171 41.895 -36.865  7.3230  0.008519 ** 
## - lcavol   1   24.4609 62.439  -6.141 45.7300 3.193e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC F value    Pr(>F)    
## <none>                 38.794 -42.787                      
## - lbph     1    2.0087 40.803 -40.900  3.7280  0.057445 .  
## - lweight  1    2.9786 41.773 -39.091  5.5281  0.021454 *  
## - svi      1    5.8688 44.663 -33.939 10.8922  0.001504 ** 
## - lcavol   1   27.6768 66.471  -3.322 51.3670 5.432e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Start:  AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC  Pr(>Chi)    
## - gleason  1    0.1810 36.760 -40.933  0.537516    
## - lcp      1    0.2693 36.849 -40.748  0.452296    
## - pgg45    1    0.3331 36.912 -40.615  0.403471    
## <none>                 36.579 -39.313              
## - age      1    1.1468 37.726 -38.936  0.123129    
## - lbph     1    2.3121 38.891 -36.594  0.029823 *  
## - svi      1    3.8819 40.461 -33.547  0.005323 ** 
## - lweight  1    3.9289 40.508 -33.457  0.005066 ** 
## - lcavol   1   20.3290 56.908  -7.282 5.425e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC  Pr(>Chi)    
## - lcp      1    0.3002 37.060 -42.307  0.428742    
## <none>                 36.760 -40.933              
## - age      1    1.1118 37.872 -40.638  0.129847    
## - pgg45    1    1.4277 38.188 -39.999  0.086733 .  
## - lbph     1    2.2925 39.053 -38.275  0.030906 *  
## - lweight  1    3.7861 40.546 -35.385  0.006007 ** 
## - svi      1    3.8174 40.578 -35.325  0.005812 ** 
## - lcavol   1   21.7089 58.469  -7.198 2.261e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC  Pr(>Chi)    
## - age      1    0.9174 37.978 -42.424  0.170020    
## <none>                 37.060 -42.307              
## - pgg45    1    1.1290 38.190 -41.996  0.128480    
## - lbph     1    2.2019 39.262 -39.863  0.035023 *  
## - svi      1    3.6525 40.713 -37.069  0.007139 ** 
## - lweight  1    3.8516 40.912 -36.693  0.005794 ** 
## - lcavol   1   25.1255 62.186  -4.453 2.737e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC  Pr(>Chi)    
## - pgg45    1    0.8161 38.794 -42.787  0.200720    
## <none>                 37.978 -42.424              
## - lbph     1    1.6290 39.607 -41.190  0.072130 .  
## - lweight  1    3.3245 41.302 -37.962  0.011023 *  
## - svi      1    3.9171 41.895 -36.865  0.005973 ** 
## - lcavol   1   24.4609 62.439  -6.141 6.119e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC  Pr(>Chi)    
## <none>                 38.794 -42.787              
## - lbph     1    2.0087 40.803 -40.900 0.0486580 *  
## - lweight  1    2.9786 41.773 -39.091 0.0170030 *  
## - svi      1    5.8688 44.663 -33.939 0.0009893 ***
## - lcavol   1   27.6768 66.471  -3.322   1.2e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Start:  AIC=32.5
## lpsa ~ 1
## 
##           Df Sum of Sq     RSS     AIC
## + lcavol   1    61.297  53.121 -24.585
## + lcp      1    35.552  78.865   5.843
## + svi      1    35.144  79.273   6.241
## + pgg45    1    20.457  93.961  19.328
## + gleason  1    16.072  98.346  22.841
## + lweight  1    12.519 101.899  25.574
## + lbph     1     5.991 108.426  30.354
## <none>                 114.418  32.496
## + age      1     2.739 111.679  32.630
## 
## Step:  AIC=-24.59
## lpsa ~ lcavol
## 
##           Df Sum of Sq    RSS     AIC
## + lweight  1    6.8011 46.320 -33.134
## + lbph     1    5.4723 47.648 -30.957
## + svi      1    5.1858 47.935 -30.495
## + pgg45    1    2.6036 50.517 -26.455
## <none>                 53.121 -24.585
## + gleason  1    1.2538 51.867 -24.425
## + lcp      1    1.2484 51.872 -24.417
## + age      1    0.1665 52.954 -22.827
## 
## Step:  AIC=-33.13
## lpsa ~ lcavol + lweight
## 
##           Df Sum of Sq    RSS     AIC
## + svi      1    5.5170 40.803 -40.900
## + pgg45    1    3.2417 43.078 -36.721
## + gleason  1    2.1983 44.121 -34.878
## + lcp      1    1.8802 44.439 -34.325
## + lbph     1    1.6569 44.663 -33.939
## <none>                 46.320 -33.134
## + age      1    0.1505 46.169 -31.385
## 
## Step:  AIC=-40.9
## lpsa ~ lcavol + lweight + svi
## 
##           Df Sum of Sq    RSS     AIC
## + lbph     1   2.00867 38.794 -42.787
## + pgg45    1   1.19581 39.607 -41.190
## + gleason  1   1.13789 39.665 -41.077
## <none>                 40.803 -40.900
## + age      1   0.11618 40.686 -39.119
## + lcp      1   0.01958 40.783 -38.937
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + svi + lbph
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 38.794 -42.787
## + gleason  1   0.86851 37.925 -42.530
## + pgg45    1   0.81611 37.978 -42.424
## + age      1   0.60442 38.190 -41.996
## + lcp      1   0.00350 38.790 -40.794
## Start:  AIC=-39.31
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + 
##     pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - gleason  1    0.1810 36.760 -40.933
## - lcp      1    0.2693 36.849 -40.748
## - pgg45    1    0.3331 36.912 -40.615
## <none>                 36.579 -39.313
## - age      1    1.1468 37.726 -38.936
## - lbph     1    2.3121 38.891 -36.594
## - svi      1    3.8819 40.461 -33.547
## - lweight  1    3.9289 40.508 -33.457
## - lcavol   1   20.3290 56.908  -7.282
## 
## Step:  AIC=-40.93
## lpsa ~ lcavol + lweight + age + lbph + svi + lcp + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - lcp      1    0.3002 37.060 -42.307
## <none>                 36.760 -40.933
## - age      1    1.1118 37.872 -40.638
## - pgg45    1    1.4277 38.188 -39.999
## + gleason  1    0.1810 36.579 -39.313
## - lbph     1    2.2925 39.053 -38.275
## - lweight  1    3.7861 40.546 -35.385
## - svi      1    3.8174 40.578 -35.325
## - lcavol   1   21.7089 58.469  -7.198
## 
## Step:  AIC=-42.31
## lpsa ~ lcavol + lweight + age + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - age      1    0.9174 37.978 -42.424
## <none>                 37.060 -42.307
## - pgg45    1    1.1290 38.190 -41.996
## + lcp      1    0.3002 36.760 -40.933
## + gleason  1    0.2119 36.849 -40.748
## - lbph     1    2.2019 39.262 -39.863
## - svi      1    3.6525 40.713 -37.069
## - lweight  1    3.8516 40.912 -36.693
## - lcavol   1   25.1255 62.186  -4.453
## 
## Step:  AIC=-42.42
## lpsa ~ lcavol + lweight + lbph + svi + pgg45
## 
##           Df Sum of Sq    RSS     AIC
## - pgg45    1    0.8161 38.794 -42.787
## <none>                 37.978 -42.424
## + age      1    0.9174 37.060 -42.307
## - lbph     1    1.6290 39.607 -41.190
## + gleason  1    0.1646 37.813 -40.758
## + lcp      1    0.1057 37.872 -40.638
## - lweight  1    3.3245 41.302 -37.962
## - svi      1    3.9171 41.895 -36.865
## - lcavol   1   24.4609 62.439  -6.141
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + lbph + svi
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 38.794 -42.787
## + gleason  1    0.8685 37.925 -42.530
## + pgg45    1    0.8161 37.978 -42.424
## + age      1    0.6044 38.190 -41.996
## - lbph     1    2.0087 40.803 -40.900
## + lcp      1    0.0035 38.790 -40.794
## - lweight  1    2.9786 41.773 -39.091
## - svi      1    5.8688 44.663 -33.939
## - lcavol   1   27.6768 66.471  -3.322
## Start:  AIC=32.5
## lpsa ~ 1
## 
##           Df Sum of Sq     RSS     AIC
## + lcavol   1    61.297  53.121 -24.585
## + lcp      1    35.552  78.865   5.843
## + svi      1    35.144  79.273   6.241
## + pgg45    1    20.457  93.961  19.328
## + gleason  1    16.072  98.346  22.841
## + lweight  1    12.519 101.899  25.574
## + lbph     1     5.991 108.426  30.354
## <none>                 114.418  32.496
## + age      1     2.739 111.679  32.630
## 
## Step:  AIC=-24.59
## lpsa ~ lcavol
## 
##           Df Sum of Sq     RSS     AIC
## + lweight  1     6.801  46.320 -33.134
## + lbph     1     5.472  47.648 -30.957
## + svi      1     5.186  47.935 -30.495
## + pgg45    1     2.604  50.517 -26.455
## <none>                  53.121 -24.585
## + gleason  1     1.254  51.867 -24.425
## + lcp      1     1.248  51.872 -24.417
## + age      1     0.166  52.954 -22.827
## - lcavol   1    61.297 114.418  32.496
## 
## Step:  AIC=-33.13
## lpsa ~ lcavol + lweight
## 
##           Df Sum of Sq     RSS     AIC
## + svi      1     5.517  40.803 -40.900
## + pgg45    1     3.242  43.078 -36.721
## + gleason  1     2.198  44.121 -34.878
## + lcp      1     1.880  44.439 -34.325
## + lbph     1     1.657  44.663 -33.939
## <none>                  46.320 -33.134
## + age      1     0.150  46.169 -31.385
## - lweight  1     6.801  53.121 -24.585
## - lcavol   1    55.579 101.899  25.574
## 
## Step:  AIC=-40.9
## lpsa ~ lcavol + lweight + svi
## 
##           Df Sum of Sq    RSS     AIC
## + lbph     1    2.0087 38.794 -42.787
## + pgg45    1    1.1958 39.607 -41.190
## + gleason  1    1.1379 39.665 -41.077
## <none>                 40.803 -40.900
## + age      1    0.1162 40.686 -39.119
## + lcp      1    0.0196 40.783 -38.937
## - svi      1    5.5170 46.320 -33.134
## - lweight  1    7.1323 47.935 -30.495
## - lcavol   1   27.4874 68.290  -3.243
## 
## Step:  AIC=-42.79
## lpsa ~ lcavol + lweight + svi + lbph
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 38.794 -42.787
## + gleason  1    0.8685 37.925 -42.530
## + pgg45    1    0.8161 37.978 -42.424
## + age      1    0.6044 38.190 -41.996
## - lbph     1    2.0087 40.803 -40.900
## + lcp      1    0.0035 38.790 -40.794
## - lweight  1    2.9786 41.773 -39.091
## - svi      1    5.8688 44.663 -33.939
## - lcavol   1   27.6768 66.471  -3.322

2a Penalized regression (RIDGE, LASSO)

Intro

OLS

  • Previously, we learnt to use OLS to give unbiased estimate
  • However, sometimes biased with low variance can give more accurate estimate
  • Penalized regression can be used to introduce bias to reduce variance

Penalized regression

We can add constraint to OLS (adding some restriction terms can make the model less complicated => generate some bias but lower variance)

Ridge and LASSO

Add constraint term

As mentioned in last part, “t” is the penalty parameter. We can transform t to λ and use it as constraint term. We add a constraint term then directly minimize the whole function.

  • As λ => 0, ridge coefficients => OLS coefficients
    (becomes ordinary least squares)
  • As λ => ∞, ridge coefficients => 0
    (Model will approach beta as zero to minimize the function (reduce the penalty))

Ridge vs LASSO regression

Ridge regression

  • Coefficients shrink towards but never reach 0.
  • All predictor variables are retained, so ridge cannot produce a parsimonious model (i.e. the number of predictors is not reduced => model complexity is unchanged)

LASSO = Least Absolute Shrinkage and Selection Operator

Determination of λ

The determination of λ is based on k-fold cross-validation

  • Randomly divide the data set into k (usually 10) equal-sized subsets
    • Use (k – 1) subsets for estimation of model (training)
    • The remaining subset for computation of prediction error (testing)

  • Calculate the mean and standard error of mean squared error (MSE) for each λ
  • We select the maximum value of λ within 1 SE of the minimum – 1-SE rule
    (Directly selecting the λ with minimum MSE may overfit, so the 1-SE rule chooses a slightly larger λ, giving a simpler model with comparable error)

Note: if k is set as sample size, it is called leave-one-out validation.

Ridge and LASSO code (R)

Package: glmnet

  • glmnet(x, y, alpha, lambda, family…) to fit regression
    • x: matrix of predictor variables
    • y: response variable
    • alpha: 0 = Ridge regression, 1 = LASSO regression
    • lambda: user-supplied λ values
    • family: Response type, e.g. “gaussian”, “binomial”, etc
  • cv.glmnet(x, y, alpha) to determine λ for Ridge regression

Ridge (R)

library(glmnet)

## 1a Ridge regression with user-supplied λ values
# model.matrix() builds the numeric predictor matrix; "-1" drops the
# intercept column because glmnet already includes its own intercept.
x <- model.matrix(lpsa ~ . - 1, data = Training)
y <- Training$lpsa
Ridge1 <- glmnet(x, y, alpha = 0, lambda = c(0.5, 1))
Ridge1
coef(Ridge1)

Note the sequence of lambda values in the output — glmnet reports results in decreasing order of λ, which may differ from the order in which they were supplied.
## 1b Ridge over glmnet's automatically chosen λ sequence
Ridge2 <- glmnet(x, y, alpha = 0)
#plot(Ridge2, xvar = "lambda", label = TRUE)  # coefficient paths vs log(λ)
#plot(Ridge2, xvar = "dev", label = TRUE)     # paths vs deviance explained

## 1c Choose λ by k-fold cross-validation (default k = 10)
set.seed(56789)  # CV folds are random; fix the seed for reproducibility
Ridge2.cv <- cv.glmnet(x, y, alpha = 0)
#plot(Ridge2.cv)
# In the CV plot: red dots show mean MSE ± one SE (vertical bars);
# the left vertical line marks the λ with minimum MSE, the right one the
# largest λ within 1 SE of that minimum (the 1-SE rule).
min(Ridge2.cv$cvm)         # minimum cross-validated MSE
log(Ridge2.cv$lambda.min)  # log(λ) achieving the minimum MSE
log(Ridge2.cv$lambda.1se)  # log(λ) selected under the 1-SE rule

## 2 Coefficients at the 1-SE λ (supplied through argument "s")
coef(Ridge2, s = Ridge2.cv$lambda.1se)

## 3 Predicted values on the held-out testing set
x.test <- model.matrix(lpsa ~ . - 1, data = Testing)
Ridge2.Pred <- predict(Ridge2, newx = x.test, s = Ridge2.cv$lambda.1se)
## [1] 0.615571
## [1] -2.416623
## [1] -0.3698812
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -0.234804862
## lcavol       0.328430341
## lweight      0.387430007
## age         -0.002396460
## lbph         0.085566483
## svi          0.500485901
## lcp          0.091304642
## gleason      0.125769781
## pgg45        0.003117404

LASSO (R)

## 1a LASSO with user-supplied λ values
# In the printed coefficients a "." means the variable was dropped from
# the model; the largest penalty keeps the fewest predictors.
LASSO1 <- glmnet(x, y, alpha = 1, lambda = c(0.25, 0.5, 0.75, 1))

## 1b LASSO over glmnet's automatic λ sequence
LASSO2 <- glmnet(x, y, alpha = 1)
#plot(LASSO2, xvar = "lambda", label = TRUE)
#plot(LASSO2, xvar = "dev", label = TRUE)

## 1c Choose λ by cross-validation
set.seed(54321)  # CV folds are random; fix the seed
LASSO2.cv <- cv.glmnet(x, y, alpha = 1)
#plot(LASSO2.cv)
# Red dots: mean MSE ± SE; left vertical line: minimum-MSE λ;
# right vertical line: largest λ within 1 SE of the minimum (1-SE rule).
min(LASSO2.cv$cvm)         # minimum cross-validated MSE
log(LASSO2.cv$lambda.min)  # log(λ) at the minimum MSE
log(LASSO2.cv$lambda.1se)  # log(λ) selected under the 1-SE rule

## 2 Coefficients at the 1-SE λ (supplied through argument "s")
coef(LASSO2, s = LASSO2.cv$lambda.1se)

## 3 Predicted values on the testing set
LASSO2.Pred <- predict(LASSO2, newx = x.test, s = LASSO2.cv$lambda.1se)
## [1] 0.6172947
## [1] -3.370219
## [1] -1.416511
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                    s1
## (Intercept) 1.1283118
## lcavol      0.4694206
## lweight     0.1733541
## age         .        
## lbph        .        
## svi         0.3438657
## lcp         .        
## gleason     .        
## pgg45       .

2b Advantages of penalized regression

A Multicollinearity

If predictor variables are correlated => results in large variance and make estimation unstable
Compared to OLS (conventional linear regression), penalized regression gives robust estimates => estimates are not too sensitive to small changes/errors/outliers

# Demonstration: OLS is unstable under (near-)perfect multicollinearity.
Prostate.col <- Prostate

## new1 is an exact linear combination of two predictors (perfect collinearity)
Prostate.col$new1 <- Prostate.col$lcavol + Prostate.col$lweight
lm(formula = lpsa ~ lcavol + lweight + new1, data = Prostate.col)

## new2 adds a little Gaussian noise, making the collinearity near-perfect
set.seed(234567)
Prostate.col$new2 <- Prostate.col$new1 + rnorm(nrow(Prostate.col), 0, 0.1)
lm(formula = lpsa ~ lcavol + lweight + new2, data = Prostate.col)
# The estimate for the constructed variable jumps from NA (dropped) to
# 1.2719 once noise is added -- OLS is sensitive to multicollinearity.
## 
## Call:
## lm(formula = lpsa ~ lcavol + lweight + new1, data = Prostate.col)
## 
## Coefficients:
## (Intercept)       lcavol      lweight         new1  
##     -0.3026       0.6775       0.5109           NA  
## 
## 
## Call:
## lm(formula = lpsa ~ lcavol + lweight + new2, data = Prostate.col)
## 
## Coefficients:
## (Intercept)       lcavol      lweight         new2  
##     -0.2495      -0.5894      -0.7710       1.2719

Comparison of estimates from ridge regression:
Similar/robustness of estimate => not too sensitive to some changes/errors/outliers

B Number of predictors

We usually assume sample size (n) is greater than no. of predictor (p)
However, in analysis like genomic study, 10000 gene predictor (p) is used to explore risk of rare disease with low incidence (n), where p>n
Ridge and LASSO regression can give estimates even when n<p

## Create a tiny data set (n = 6) so that p > n
set.seed(21098)
SmallIndex <- sample(seq_len(nrow(Prostate)), 6)
Small <- Prostate[SmallIndex, ]

## 1 OLS: with more predictors than observations only the first few
##   coefficients are estimable; the remainder are reported as NA
lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45, data = Small)

## 2 Ridge and LASSO still produce estimates for every predictor
x.Small <- model.matrix(
  lpsa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45 - 1,
  data = Small
)
y.Small <- Small$lpsa
Small.Ridge <- glmnet(x.Small, y.Small, alpha = 0, lambda = c(0.1, 0.01))
coef(Small.Ridge)
Small.LASSO <- glmnet(x.Small, y.Small, alpha = 1, lambda = c(0.1, 0.01))
coef(Small.LASSO)
## 
## Call:
## lm(formula = lpsa ~ lcavol + lweight + age + lbph + svi + lcp + 
##     gleason + pgg45, data = Small)
## 
## Coefficients:
## (Intercept)       lcavol      lweight          age         lbph          svi  
##    -5.67764      0.69457      0.70011      0.07270     -0.03293           NA  
##         lcp      gleason        pgg45  
##     0.15074           NA           NA  
## 
## 9 x 2 sparse Matrix of class "dgCMatrix"
##                       s0          s1
## (Intercept) -3.983189980 -3.34701545
## lcavol       0.634838300  0.71370283
## lweight      0.377882039  0.39886193
## age          0.052767193  0.04535218
## lbph         0.043633242  0.03597348
## svi          .            .         
## lcp          0.337259634  0.42433255
## gleason      0.156275254  0.12605415
## pgg45        0.005244347  0.00601659
## 9 x 2 sparse Matrix of class "dgCMatrix"
##                      s0            s1
## (Intercept) -4.61897188 -5.5170679541
## lcavol       0.62848146  0.6821712429
## lweight      0.45130177  0.5797324637
## age          0.06858169  0.0746291622
## lbph         .           .           
## svi          .           .           
## lcp          .           0.0129012020
## gleason      .           .           
## pgg45        .           0.0005385419

Comparison

3 Elastic net (ELASTIC NET)

What is elastic net?

Elastic net is the sum of ridge and LASSO estimates with weighting between these two penalty functions.
α: 0 = Ridge regression, 1 = LASSO regression, 0 < α < 1 = Elastic net

Elastic net (R)

# Rebuild the design matrices (same construction as in the ridge/LASSO
# sections: "-1" drops the intercept column, which glmnet supplies itself).
x <- model.matrix(lpsa ~ . - 1, data = Training)
y <- Training$lpsa
x.test <- model.matrix(lpsa ~ . - 1, data = Testing)

library(glmnet)
# Fit elastic nets over a grid of mixing parameters:
# alpha = 0 is ridge, alpha = 1 is LASSO, 0 < alpha < 1 blends the penalties.
Elastic000 <- glmnet(x, y, alpha = 0)
Elastic005 <- glmnet(x, y, alpha = 0.05)
Elastic010 <- glmnet(x, y, alpha = 0.1)
Elastic020 <- glmnet(x, y, alpha = 0.2)
Elastic050 <- glmnet(x, y, alpha = 0.5)
Elastic080 <- glmnet(x, y, alpha = 0.8)
Elastic090 <- glmnet(x, y, alpha = 0.9)
Elastic095 <- glmnet(x, y, alpha = 0.95)
Elastic100 <- glmnet(x, y, alpha = 1)

# 3 x 3 grid of coefficient-path plots, one panel per alpha, in order.
par(mfrow = c(3, 3), mar = c(5, 4, 2, 1))
for (fit in list(Elastic000, Elastic005, Elastic010, Elastic020, Elastic050,
                 Elastic080, Elastic090, Elastic095, Elastic100)) {
  plot(fit, xvar = "lambda", label = TRUE)
}

Elastic net (R)2

## 1 Choose λ by cross-validation (mixing parameter fixed at alpha = 0.5)
set.seed(76543)  # CV folds are random; fix the seed
Elastic.cv <- cv.glmnet(x, y, alpha = 0.5)
#plot(Elastic.cv)

min(Elastic.cv$cvm)          # minimum cross-validated MSE
log(Elastic.cv$lambda.min)   # log(λ) at the minimum MSE
log(Elastic.cv$lambda.1se)   # log(λ) selected under the 1-SE rule

## 2 Coefficients at the 1-SE λ
Elastic050 <- glmnet(x, y, alpha = 0.5)
coef(Elastic050, s = Elastic.cv$lambda.1se)

## 3 Predicted values on the testing set
Elastic.Pred <- predict(Elastic050, newx = x.test, s = Elastic.cv$lambda.1se)
## [1] 0.6108528
## [1] -2.956173
## [1] -1.095499
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                      s1
## (Intercept) 0.818501440
## lcavol      0.425218017
## lweight     0.258692154
## age         .          
## lbph        0.030889415
## svi         0.447316008
## lcp         0.015142118
## gleason     .          
## pgg45       0.001737656

Application

2 real-life examples.

4 Practical 8

Prepare data sets

## Read the low-birth-weight data and prepare design matrices.
LowBWT <- read.csv("Data/lowbwt.csv")

# The categorical variables are stored as integer codes; convert each to
# a factor so model.matrix() expands them into dummy variables.
# (Replaces five repeated factor() assignments with one lapply.)
factor_cols <- c("LOW", "RACE", "SMOKE", "HT", "UI")
LowBWT[factor_cols] <- lapply(LowBWT[factor_cols], factor)

# Continuous outcome "BWT": "-1" drops the intercept (glmnet adds its
# own); also exclude the ID column and the alternative outcome LOW.
x.con <- model.matrix(BWT ~ . - 1 - ID - LOW, data = LowBWT)
y.con <- LowBWT$BWT
# Binary outcome "LOW": likewise exclude ID and the other outcome BWT.
x.bin <- model.matrix(LOW ~ . - 1 - ID - BWT, data = LowBWT)
y.bin <- LowBWT$LOW

Continuous outcome, Ridge regression

## Ridge regression (alpha = 0) for the continuous outcome BWT
Ridge1 <- glmnet(x.con, y.con, alpha = 0)
#plot(Ridge1, xvar = "lambda", label = TRUE)
#plot(Ridge1, xvar = "dev", label = TRUE)
set.seed(56789)  # CV folds are random; fix the seed
Ridge1.cv <- cv.glmnet(x.con, y.con, alpha = 0)  # was "=", now "<-" for consistency
#plot(Ridge1.cv)
coef(Ridge1, s = Ridge1.cv$lambda.1se)  # coefficients at the 1-SE λ
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) 2847.7067856
## AGE            1.6026551
## LWT            0.7244884
## RACE1         50.8523715
## RACE2        -44.7463234
## RACE3        -32.3425866
## SMOKE1       -51.2953483
## PTL          -34.7321318
## HT1          -80.3629270
## UI1          -96.8631102
## FTV            4.7682107

Continuous outcome, LASSO

## LASSO (alpha = 1) for the continuous outcome BWT
LASSO1 <- glmnet(x.con, y.con, alpha = 1)
#plot(LASSO1, xvar = "lambda", label = TRUE)
#plot(LASSO1, xvar = "dev", label = TRUE)
set.seed(65432)  # CV folds are random; fix the seed
LASSO1.cv <- cv.glmnet(x.con, y.con, alpha = 1)  # was "=", now "<-" for consistency
#plot(LASSO1.cv)
coef(LASSO1, s = LASSO1.cv$lambda.1se)  # "." marks variables dropped at the 1-SE λ
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) 2919.9406349
## AGE            .        
## LWT            0.2770576
## RACE1        126.8831127
## RACE2          .        
## RACE3          .        
## SMOKE1       -89.3331248
## PTL            .        
## HT1          -29.7483236
## UI1         -262.1262526
## FTV            .

Binary outcome, Ridge regression

## Ridge logistic regression (alpha = 0, family = "binomial") for binary LOW
Ridge2 <- glmnet(x.bin, y.bin, alpha = 0, family = "binomial")
#plot(Ridge2, xvar = "lambda", label = TRUE)
#plot(Ridge2, xvar = "dev", label = TRUE)
set.seed(56789)  # CV folds are random; fix the seed
Ridge2.cv <- cv.glmnet(x.bin, y.bin, alpha = 0, family = "binomial")  # was "=", now "<-"
#plot(Ridge2.cv)
coef(Ridge2, s = Ridge2.cv$lambda.1se)  # coefficients (log-odds scale) at the 1-SE λ
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -0.560026872
## AGE         -0.005036865
## LWT         -0.001320565
## RACE1       -0.072877588
## RACE2        0.067682691
## RACE3        0.044508306
## SMOKE1       0.082406149
## PTL          0.093369023
## HT1          0.158414316
## UI1          0.110805589
## FTV         -0.012006530

Binary outcome, LASSO

## LASSO logistic regression (alpha = 1, family = "binomial") for binary LOW
LASSO2 <- glmnet(x.bin, y.bin, alpha = 1, family = "binomial")
#plot(LASSO2, xvar = "lambda", label = TRUE)
#plot(LASSO2, xvar = "dev", label = TRUE)
set.seed(65432)  # CV folds are random; fix the seed
LASSO2.cv <- cv.glmnet(x.bin, y.bin, alpha = 1, family = "binomial")  # was "=", now "<-"
#plot(LASSO2.cv)
coef(LASSO2, s = LASSO2.cv$lambda.1se)  # "." marks variables dropped at the 1-SE λ
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -0.402500274
## AGE          .          
## LWT         -0.003655509
## RACE1       -0.248026319
## RACE2        .          
## RACE3        .          
## SMOKE1       0.229056074
## PTL          0.245427433
## HT1          0.465913634
## UI1          0.214356081
## FTV          .